import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.patches as mpatches
# IPython magic: render matplotlib figures inline in the notebook
# (valid only inside Jupyter/IPython, not in plain Python).
%matplotlib inline
plt.rcParams['figure.figsize'] = [12, 10]  # default size for every figure below
vectorizer = CountVectorizer()  # NOTE(review): appears unused in this notebook — confirm before removing
# %%time cell magic: reports how long loading the ~6.7M-row CSV takes.
%%time
data = pd.read_csv('Crimes_-_2001_to_present.csv')
After the data is read in, the first thing I wanted to do was see exactly how much there was and what it looked like.
# Quick structural overview of the dataset: row/column counts,
# a sample of the first rows, and per-column dtypes / memory info.
data.shape
data.head()
data.info()
When working with the data I found that there are about 150-200 outliers when plotting the points based on location. I decided to remove them, as 200 points out of over 6.7 million is negligible.
# Drop geographic outliers: keep only coordinates inside Chicago's
# plausible bounding box (~150-200 bad points out of 6.7M rows).
in_bounds = (data.Longitude > -90) & (data.Latitude > 40)
data = data.loc[in_bounds]
The alpha values make this graph misleading with respect to the ratio of arrests to non-arrests; I chose them to better see where the arrests were happening.
# One scatter map per year: arrests drawn dark and opaque on top of
# faint yellow non-arrests, so arrest hot-spots stand out.
for year in range(2001, 2019):
    yearly = data[data.Year == year]
    made = yearly[yearly.Arrest == True]
    not_made = yearly[yearly.Arrest == False]
    arrest_pts = plt.scatter(made.Longitude, made.Latitude,
                             c="#440154", s=1, alpha=.9)
    free_pts = plt.scatter(not_made.Longitude, not_made.Latitude,
                           c="#FDE726", s=1, alpha=.05)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title(year)
    plt.grid(True)
    plt.legend((arrest_pts, free_pts), ("Arrest made", "No Arrest Made"),
               scatterpoints=50)
    plt.show()
Incidents have gone down over the years, with the exception of 2016. I suspect it has something to do with Trump's inauguration.
Arrests have also been about 25% of the total number of incidents over the years
# Stacked bar chart of incidents per year, split by arrest outcome.
# A single groupby pass replaces the original 3x18 full-table boolean
# scans (the table has ~6.7M rows, so repeated filtering is costly).
counts = data.groupby(['Year', 'Arrest']).size()
arrests = [counts.get((year, True), 0) for year in range(2001, 2019)]
non_arrests = [counts.get((year, False), 0) for year in range(2001, 2019)]
crime_rate = [a + n for a, n in zip(arrests, non_arrests)]  # total per year
p1 = plt.bar(range(0, len(crime_rate)), arrests)
p2 = plt.bar(range(0, len(crime_rate)), non_arrests, bottom=arrests)
plt.xticks(range(0, len(crime_rate)), range(2001, 2019))
plt.xlabel('year')
plt.ylabel('reported incidents')
plt.legend((p1, p2), ('Arrest Made', 'No Arrest Made'))
Theft was consistently the top contender, with Battery in second and Narcotics and Criminal Damage coming in 3rd and 4th, until 2016, when Narcotics dropped and Assault took 4th place.
# Most common crimes by year: one line per Primary Type.
# create a color palette
palette = plt.get_cmap('tab20')
# make dataframe of counts, indexed by year with one column per crime
crimes = pd.unique(data['Primary Type'])
years = pd.unique(data['Year'])
years.sort()
# A single crosstab pass replaces the original O(years x crime-types)
# double loop of full-table boolean filters. reindex preserves the
# original column order (first-appearance order of crimes) and fills
# any missing year/crime combination with 0.
dframe = (pd.crosstab(data['Year'], data['Primary Type'])
            .reindex(index=years, columns=crimes, fill_value=0))
# multiple line plot
# NOTE(review): tab20 has only 20 colours; if there are more than 20
# crime types, indices past 19 all map to the last colour, so some
# lines share a colour. Behaviour kept as in the original.
num = 0
for column in dframe:
    num += 1
    plt.plot(dframe.index, dframe[column], marker='', color=palette(num),
             linewidth=2, alpha=0.9, label=column)
# Add legend below the plot
plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=4)
plt.xticks(range(2001, 2019))
# Add titles
plt.xlabel("Year")
plt.ylabel("Number of Reports")
# Incident counts per police district per year, one line per district.
# create a color palette
palette = plt.get_cmap('tab20')
# make dataframe of counts, indexed by year with one column per district
districts = pd.unique(data['District'])
districts.sort()
years = pd.unique(data['Year'])
years.sort()
# A single crosstab pass replaces the original O(years x districts)
# double loop of full-table boolean filters. A NaN district label (if
# present) matched nothing in the original (NaN == NaN is False) and
# is likewise filled with 0 here.
dframe = (pd.crosstab(data['Year'], data['District'])
            .reindex(index=years, columns=districts, fill_value=0))
# multiple line plot
# NOTE(review): tab20 has only 20 colours; with more districts than
# that, indices past 19 all map to the last colour. Kept as original.
num = 0
for column in dframe:
    num += 1
    plt.plot(dframe.index, dframe[column], marker='', color=palette(num),
             linewidth=2, alpha=0.9, label=column)
# Add legend below the plot
plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=4)
plt.xticks(range(2001, 2019))
# Add titles
plt.xlabel("Year")
plt.ylabel("Number of Reports")
AGAIN, the graph is misleading with respect to the ratio of arrests to non-arrests.
# Arrest vs non-arrest scatter map, one figure per police district.
district_ids = data.District.unique()
district_ids.sort()
for dist in district_ids:
    subset = data[data.District == dist]
    cuffed = subset[subset.Arrest == True]
    uncuffed = subset[subset.Arrest == False]
    pts_arrest = plt.scatter(cuffed.Longitude, cuffed.Latitude,
                             c='#440154', s=1, alpha=.9)
    pts_free = plt.scatter(uncuffed.Longitude, uncuffed.Latitude,
                           c='#FDE726', s=1, alpha=.05)
    plt.title("district number %s" % dist)
    plt.legend((pts_arrest, pts_free), ("Arrest", "No Arrest"),
               scatterpoints=50)
    plt.show()
I wonder where the most incidents happen?
# Colour-code every incident by its district on a single city-wide map.
districts = data.District.unique()
districts.sort()
plots = []
# create a color palette
palette = plt.get_cmap('tab20')
for idx, district in enumerate(districts, start=1):
    in_district = data[data.District == district]
    plots.append(plt.scatter(in_district.Longitude, in_district.Latitude,
                             c=palette(idx), s=1, label=district))
plt.legend(scatterpoints=50)
We can see that relatively few incidents occurred in a couple of areas, namely the International Airport and the International Golf Course. Some other areas with little data are the river going through middle Chicago and some public parks. A popular area seems to be the Navy Pier.
# Per-year scatter of incident locations, with marginal histograms:
# longitude distribution above the map, latitude to its right.
for year in range(2001, 2019):
    yearly = data[data.Year == year]
    lon = yearly.Longitude
    lat = yearly.Latitude

    # Axes layout: large square scatter plus two marginal strips.
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    plt.figure(1, figsize=(10, 10))
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # the scatter plot itself
    axScatter.scatter(lon, lat, s=1, alpha=.01)

    # Histogram buckets of ~0.001 degrees across the observed range.
    binwidth = .001
    lon_min, lon_max = np.min(lon), np.max(lon)
    lat_min, lat_max = np.min(lat), np.max(lat)
    axScatter.set_xlim((lon_min, lon_max))
    axScatter.set_ylim((lat_min, lat_max))
    axHistx.hist(lon, bins=np.arange(lon_min, lon_max + binwidth, binwidth))
    axHisty.hist(lat, bins=np.arange(lat_min, lat_max + binwidth, binwidth),
                 orientation='horizontal')
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())
    axScatter.grid(True)
    axHisty.grid(True)
    axHistx.grid(True)
    plt.title(year)
    plt.show()
The least common hour of day for an incident is 5am, and the most common is around the time when people start leaving work. This doesn't come as a shock, since the most common crimes were theft and battery.
# Incident counts by hour of day.
time_format = '%m/%d/%Y %I:%M:%S %p'
times = pd.to_datetime(data['Date'], format=time_format)
times.head()
# Vectorized count replaces the original Python-level loop over all
# ~6.7M timestamps; reindex guarantees all 24 hours appear (0 where
# an hour has no incidents) in hour order.
hour_buckets = times.dt.hour.value_counts().reindex(range(24), fill_value=0).tolist()
plt.bar(range(24), hour_buckets)
plt.title('incidents at current hour')
plt.ylabel("number of incidents")
plt.xlabel("hour of day")
plt.xticks(range(24))
plt.show()
If we look at just arrests we can see a different graph. 5am is still the least common, but now there is a dip in the old high. This could mean that more small incidents are occurring. I figure this is because people are driving home from work and getting pulled over.
# Same hourly distribution, restricted to incidents that led to an arrest.
arrests = data[data.Arrest == True]
arrest_times = pd.to_datetime(arrests['Date'], format=time_format)
# Vectorized count (see the incident-hour chart above for rationale):
# replaces the original per-row Python loop; reindex keeps all 24 hours.
hour_buckets = arrest_times.dt.hour.value_counts().reindex(range(24), fill_value=0).tolist()
plt.bar(range(24), hour_buckets)
plt.title('arrest at current hour')
plt.ylabel("number of arrests")
plt.xlabel("hour of day")
plt.xticks(range(24))
plt.show()